In [7]:
import pandas as pd
import numpy
import json
from collections import defaultdict
from matplotlib.pylab import style
style.use('fivethirtyeight')
%pylab inline
java_min_int = -2147483648


Populating the interactive namespace from numpy and matplotlib

In [8]:
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])

In [3]:
def split_column(q_str):
    if type(q_str) is float:
        if numpy.isnan(q_str):
            return q_str 
    if type(q_str) is str:
        qs = q_str.split('|')
        return qs[0] #cos the format will always end with a |

In [4]:
for col in ['place_of_birth','gender', 'citizenship','ethnic_group']:
    allrecs[col] = allrecs[col].apply(split_column)

In [5]:
allrecs.head(5)


Out[5]:
qid dob dod gender ethnic_group citizenship place_of_birth site_links
0 Q23 1732 1799 Q6581097 NaN Q30 Q494413 zhwiki|kywiki|euwiki|plwiki|bswiki|angwiki|uzw...
1 Q42 1952 2001 Q6581097 NaN Q145 Q350 zhwiki|jvwiki|euwiki|plwiki|bswiki|eswiki|tawi...
2 Q207 1946 NaN Q6581097 NaN Q30 Q49145 uzwiki|eswiki|kowikiquote|huwiki|liwikiquote|p...
3 Q297 NaN 1660 Q6581097 NaN Q29 Q8717 zhwiki|kywiki|plwiki|euwiki|bswiki|uzwiki|eswi...
4 Q326 1942 NaN Q6581097 NaN Q298 Q2887 zhwiki|plwiki|euwiki|kowiki|frwiki|eswiki|yowi...

Explanation of Aggregation Maps

#todo what about mechanical maps

  • pobs_map.json
    • pob qid $\mapsto$ country qid
  • citizenships_map.csv
    • citizenship $\leftrightarrow$ english country name
  • country_maps.csv
    • country qid $\leftrightarrow$ english country name $\leftrightarrow$ world culture

In [6]:
pobs_map = json.load(open('helpers/aggregation_maps/pobs_map.json','r'))
country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')

ethnic_group_map = json.load(open('helpers/aggregation_maps/mechanical_turk/ethnic_groups_map.json','r'))
citizenship_map = json.load(open('helpers/aggregation_maps/mechanical_turk/citizenship_map.json','r')) 

def map_pob(qid):
    if not type(qid) is str:
        return None
    else:
        country_list = pobs_map[qid]
        if len(country_list) == 0:
            return None
        else:
            country = country_list[0] #assumption
            culture = country_map.ix[country]['culture_name']
            return culture

def map_wrapper(m):
    def return_fun(qid):
        try:
            return m[qid]
        except KeyError:
            return None
    return return_fun

mismatch = pd.DataFrame()


#order is important because it determines the preference we will use
col_map_fun = zip(['ethnic_group', 'citizenship', 'place_of_birth'],
                  [map_wrapper(ethnic_group_map),map_wrapper(citizenship_map), map_pob])

def determine_culture(row):
    culture = None
    for col, map_fun in col_map_fun:
        guess = map_fun(row[col])
        if (culture is not None) and (guess is not None):
            if culture != guess:
                mismatch.append(row,ignore_index=True)
        if guess:
            culture = guess
    
    return str(culture).lower() if culture else culture #to return None properly

In [173]:
%%timeit -r 1 -n 1
allrecs.iloc[0:2500].apply(lambda x: determine_culture(x), axis=1)


1 loops, best of 1: 1.77 s per loop

In [174]:
%%timeit -r 1 -n 1
allrecs.iloc[0:25000].apply(lambda x: determine_culture(x), axis=1)


1 loops, best of 1: 17 s per loop

In [17]:
allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-17-5dc819b596ea> in <module>()
----> 1 allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1)

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   3594                     if reduce is None:
   3595                         reduce = True
-> 3596                     return self._apply_standard(f, axis, reduce=reduce)
   3597             else:
   3598                 return self._apply_broadcast(f, axis)

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce)
   3646                 labels = self._get_agg_axis(axis)
   3647                 result = lib.reduce(values, func, axis=axis, dummy=dummy,
-> 3648                                     labels=labels)
   3649                 return Series(result, index=labels)
   3650             except Exception:

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.reduce (pandas/lib.c:40234)()

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.Reducer.get_result (pandas/lib.c:30025)()

<ipython-input-17-5dc819b596ea> in <lambda>(x)
----> 1 allrecs['culture'] = allrecs.apply(lambda x: determine_culture(x), axis=1)

<ipython-input-16-ff2687e8a0fd> in determine_culture(row)
     38         if (culture is not None) and (guess is not None):
     39             if culture != guess:
---> 40                 mismatch.append(row,ignore_index=True)
     41         if guess:
     42             culture = guess

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in append(self, other, ignore_index, verify_integrity)
   3818             to_concat = [self, other]
   3819         return concat(to_concat, ignore_index=ignore_index,
-> 3820                       verify_integrity=verify_integrity)
   3821 
   3822     def join(self, other, on=None, how='left', lsuffix='', rsuffix='',

/usr/local/lib/python2.7/dist-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
    723                        verify_integrity=verify_integrity,
    724                        copy=copy)
--> 725     return op.get_result()
    726 
    727 

/usr/local/lib/python2.7/dist-packages/pandas/tools/merge.pyc in get_result(self)
    894 
    895             new_data = concatenate_block_managers(
--> 896                 mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy)
    897             if not self.copy:
    898                 new_data._consolidate_inplace()

/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy)
   4044                                                 copy=copy),
   4045                          placement=placement)
-> 4046               for placement, join_units in concat_plan]
   4047 
   4048     return BlockManager(blocks, axes)

/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in concatenate_join_units(join_units, concat_axis, copy)
   4133         raise AssertionError("Concatenating join units along axis0")
   4134 
-> 4135     empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units)
   4136 
   4137     to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype,

/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in get_empty_dtype_and_na(join_units)
   4072             has_none_blocks = True
   4073         else:
-> 4074             dtypes[i] = unit.dtype
   4075 
   4076     # dtypes = set()

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.cache_readonly.__get__ (pandas/lib.c:40766)()

/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in dtype(self)
   4343             raise AssertionError("Block is None, no dtype")
   4344 
-> 4345         if not self.needs_filling:
   4346             return self.block.dtype
   4347         else:

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.cache_readonly.__get__ (pandas/lib.c:40766)()

/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in needs_filling(self)
   4333         for indexer in self.indexers.values():
   4334             # FIXME: cache results of indexer == -1 checks.
-> 4335             if (indexer == -1).any():
   4336                 return True
   4337 

KeyboardInterrupt: 

In [195]:
print mismatch


Empty DataFrame
Columns: []
Index: []

In [176]:
allrecs.to_json('helpers/world_cultures_shortcut.json')

In [5]:
allrecs = pd.DataFrame.from_dict(json.load(open('helpers/world_cultures_shortcut.json','r')))

In [201]:
import scipy.stats
scipy.stats.spearmanr(rank_compare[['Rank','Rank_wikidata']])


Out[201]:
(0.09637690726400637, 0.25388210576052661)

In [207]:
scipy.stats.mannwhitneyu(rank_compare['Rank'],rank_compare['Rank_wikidata'])


Out[207]:
(10078.0, 0.49798226262171613)

In [208]:
scipy.stats.ranksums(rank_compare['Rank'],rank_compare['Rank_wikidata'])


Out[208]:
(0.0057801597300572065, 0.99538812547307132)

In [205]:
print rank_compare.to_html()


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Economy</th>
      <th>Rank</th>
      <th>Rank_wikidata</th>
      <th>diff</th>
      <th>abs_diff</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0  </th>
      <td>                    Iceland</td>
      <td>   1</td>
      <td>  73</td>
      <td> -72</td>
      <td>  72</td>
    </tr>
    <tr>
      <th>1  </th>
      <td>                    Finland</td>
      <td>   2</td>
      <td>  49</td>
      <td> -47</td>
      <td>  47</td>
    </tr>
    <tr>
      <th>2  </th>
      <td>                     Norway</td>
      <td>   3</td>
      <td>  58</td>
      <td> -55</td>
      <td>  55</td>
    </tr>
    <tr>
      <th>3  </th>
      <td>                     Sweden</td>
      <td>   4</td>
      <td>  32</td>
      <td> -28</td>
      <td>  28</td>
    </tr>
    <tr>
      <th>4  </th>
      <td>                    Denmark</td>
      <td>   5</td>
      <td>  59</td>
      <td> -54</td>
      <td>  54</td>
    </tr>
    <tr>
      <th>5  </th>
      <td>                  Nicaragua</td>
      <td>   6</td>
      <td>  25</td>
      <td> -19</td>
      <td>  19</td>
    </tr>
    <tr>
      <th>6  </th>
      <td>                     Rwanda</td>
      <td>   7</td>
      <td>  67</td>
      <td> -60</td>
      <td>  60</td>
    </tr>
    <tr>
      <th>7  </th>
      <td>                    Ireland</td>
      <td>   8</td>
      <td>  77</td>
      <td> -69</td>
      <td>  69</td>
    </tr>
    <tr>
      <th>8  </th>
      <td>                Philippines</td>
      <td>   9</td>
      <td>   2</td>
      <td>   7</td>
      <td>   7</td>
    </tr>
    <tr>
      <th>9  </th>
      <td>                    Belgium</td>
      <td>  10</td>
      <td>  98</td>
      <td> -88</td>
      <td>  88</td>
    </tr>
    <tr>
      <th>10 </th>
      <td>                Switzerland</td>
      <td>  11</td>
      <td> 116</td>
      <td>-105</td>
      <td> 105</td>
    </tr>
    <tr>
      <th>11 </th>
      <td>                    Germany</td>
      <td>  12</td>
      <td> 117</td>
      <td>-105</td>
      <td> 105</td>
    </tr>
    <tr>
      <th>12 </th>
      <td>                New Zealand</td>
      <td>  13</td>
      <td>  38</td>
      <td> -25</td>
      <td>  25</td>
    </tr>
    <tr>
      <th>13 </th>
      <td>                Netherlands</td>
      <td>  14</td>
      <td>  72</td>
      <td> -58</td>
      <td>  58</td>
    </tr>
    <tr>
      <th>14 </th>
      <td>                     Latvia</td>
      <td>  15</td>
      <td>  60</td>
      <td> -45</td>
      <td>  45</td>
    </tr>
    <tr>
      <th>15 </th>
      <td>                     France</td>
      <td>  16</td>
      <td>  96</td>
      <td> -80</td>
      <td>  80</td>
    </tr>
    <tr>
      <th>16 </th>
      <td>                    Burundi</td>
      <td>  17</td>
      <td> 141</td>
      <td>-124</td>
      <td> 124</td>
    </tr>
    <tr>
      <th>17 </th>
      <td>               South Africa</td>
      <td>  18</td>
      <td> 103</td>
      <td> -85</td>
      <td>  85</td>
    </tr>
    <tr>
      <th>18 </th>
      <td>                     Canada</td>
      <td>  19</td>
      <td>  23</td>
      <td>  -4</td>
      <td>   4</td>
    </tr>
    <tr>
      <th>19 </th>
      <td>   United States of America</td>
      <td>  20</td>
      <td>  31</td>
      <td> -11</td>
      <td>  11</td>
    </tr>
    <tr>
      <th>20 </th>
      <td>                    Ecuador</td>
      <td>  21</td>
      <td> 109</td>
      <td> -88</td>
      <td>  88</td>
    </tr>
    <tr>
      <th>21 </th>
      <td>                   Bulgaria</td>
      <td>  22</td>
      <td>  53</td>
      <td> -31</td>
      <td>  31</td>
    </tr>
    <tr>
      <th>22 </th>
      <td>                   Slovenia</td>
      <td>  23</td>
      <td>  78</td>
      <td> -55</td>
      <td>  55</td>
    </tr>
    <tr>
      <th>23 </th>
      <td>                  Australia</td>
      <td>  24</td>
      <td>  18</td>
      <td>   6</td>
      <td>   6</td>
    </tr>
    <tr>
      <th>24 </th>
      <td>                    Moldova</td>
      <td>  25</td>
      <td>  68</td>
      <td> -43</td>
      <td>  43</td>
    </tr>
    <tr>
      <th>25 </th>
      <td>             United Kingdom</td>
      <td>  26</td>
      <td>  42</td>
      <td> -16</td>
      <td>  16</td>
    </tr>
    <tr>
      <th>26 </th>
      <td>                 Mozambique</td>
      <td>  27</td>
      <td>  61</td>
      <td> -34</td>
      <td>  34</td>
    </tr>
    <tr>
      <th>27 </th>
      <td>                 Luxembourg</td>
      <td>  28</td>
      <td> 107</td>
      <td> -79</td>
      <td>  79</td>
    </tr>
    <tr>
      <th>28 </th>
      <td>                      Spain</td>
      <td>  29</td>
      <td>  88</td>
      <td> -59</td>
      <td>  59</td>
    </tr>
    <tr>
      <th>29 </th>
      <td>                       Cuba</td>
      <td>  30</td>
      <td>  26</td>
      <td>   4</td>
      <td>   4</td>
    </tr>
    <tr>
      <th>30 </th>
      <td>                  Argentina</td>
      <td>  31</td>
      <td> 102</td>
      <td> -71</td>
      <td>  71</td>
    </tr>
    <tr>
      <th>31 </th>
      <td>                    Belarus</td>
      <td>  32</td>
      <td>  70</td>
      <td> -38</td>
      <td>  38</td>
    </tr>
    <tr>
      <th>32 </th>
      <td>                   Barbados</td>
      <td>  33</td>
      <td>  55</td>
      <td> -22</td>
      <td>  22</td>
    </tr>
    <tr>
      <th>33 </th>
      <td>                     Malawi</td>
      <td>  34</td>
      <td>  92</td>
      <td> -58</td>
      <td>  58</td>
    </tr>
    <tr>
      <th>34 </th>
      <td>                The Bahamas</td>
      <td>  35</td>
      <td>  36</td>
      <td>  -1</td>
      <td>   1</td>
    </tr>
    <tr>
      <th>35 </th>
      <td>                    Austria</td>
      <td>  36</td>
      <td>  82</td>
      <td> -46</td>
      <td>  46</td>
    </tr>
    <tr>
      <th>36 </th>
      <td>                      Kenya</td>
      <td>  37</td>
      <td>   9</td>
      <td>  28</td>
      <td>  28</td>
    </tr>
    <tr>
      <th>37 </th>
      <td>                    Lesotho</td>
      <td>  38</td>
      <td>  43</td>
      <td>  -5</td>
      <td>   5</td>
    </tr>
    <tr>
      <th>38 </th>
      <td>                   Portugal</td>
      <td>  39</td>
      <td>  95</td>
      <td> -56</td>
      <td>  56</td>
    </tr>
    <tr>
      <th>39 </th>
      <td>                    Namibia</td>
      <td>  40</td>
      <td> 112</td>
      <td> -72</td>
      <td>  72</td>
    </tr>
    <tr>
      <th>40 </th>
      <td>                 Madagascar</td>
      <td>  41</td>
      <td>  99</td>
      <td> -58</td>
      <td>  58</td>
    </tr>
    <tr>
      <th>41 </th>
      <td>                   Mongolia</td>
      <td>  42</td>
      <td>  71</td>
      <td> -29</td>
      <td>  29</td>
    </tr>
    <tr>
      <th>42 </th>
      <td>                 Kazakhstan</td>
      <td>  43</td>
      <td>  44</td>
      <td>  -1</td>
      <td>   1</td>
    </tr>
    <tr>
      <th>43 </th>
      <td>                  Lithuania</td>
      <td>  44</td>
      <td>  65</td>
      <td> -21</td>
      <td>  21</td>
    </tr>
    <tr>
      <th>44 </th>
      <td>                       Peru</td>
      <td>  45</td>
      <td>  97</td>
      <td> -52</td>
      <td>  52</td>
    </tr>
    <tr>
      <th>45 </th>
      <td>                     Panama</td>
      <td>  46</td>
      <td>  39</td>
      <td>   7</td>
      <td>   7</td>
    </tr>
    <tr>
      <th>46 </th>
      <td>                   Tanzania</td>
      <td>  47</td>
      <td>  16</td>
      <td>  31</td>
      <td>  31</td>
    </tr>
    <tr>
      <th>47 </th>
      <td>                 Costa Rica</td>
      <td>  48</td>
      <td> 129</td>
      <td> -81</td>
      <td>  81</td>
    </tr>
    <tr>
      <th>48 </th>
      <td>        Trinidad and Tobago</td>
      <td>  49</td>
      <td>  24</td>
      <td>  25</td>
      <td>  25</td>
    </tr>
    <tr>
      <th>49 </th>
      <td>                 Cape Verde</td>
      <td>  50</td>
      <td> 136</td>
      <td> -86</td>
      <td>  86</td>
    </tr>
    <tr>
      <th>50 </th>
      <td>                   Botswana</td>
      <td>  51</td>
      <td>  46</td>
      <td>   5</td>
      <td>   5</td>
    </tr>
    <tr>
      <th>51 </th>
      <td>                    Jamaica</td>
      <td>  52</td>
      <td>  21</td>
      <td>  31</td>
      <td>  31</td>
    </tr>
    <tr>
      <th>52 </th>
      <td>                   Colombia</td>
      <td>  53</td>
      <td>  63</td>
      <td> -10</td>
      <td>  10</td>
    </tr>
    <tr>
      <th>53 </th>
      <td>                     Serbia</td>
      <td>  54</td>
      <td>  62</td>
      <td>  -8</td>
      <td>   8</td>
    </tr>
    <tr>
      <th>54 </th>
      <td>                    Croatia</td>
      <td>  55</td>
      <td>  86</td>
      <td> -31</td>
      <td>  31</td>
    </tr>
    <tr>
      <th>55 </th>
      <td>                    Ukraine</td>
      <td>  56</td>
      <td>  79</td>
      <td> -23</td>
      <td>  23</td>
    </tr>
    <tr>
      <th>56 </th>
      <td>                     Poland</td>
      <td>  57</td>
      <td>  84</td>
      <td> -27</td>
      <td>  27</td>
    </tr>
    <tr>
      <th>57 </th>
      <td>                    Bolivia</td>
      <td>  58</td>
      <td> 128</td>
      <td> -70</td>
      <td>  70</td>
    </tr>
    <tr>
      <th>58 </th>
      <td>                  Singapore</td>
      <td>  59</td>
      <td>   8</td>
      <td>  51</td>
      <td>  51</td>
    </tr>
    <tr>
      <th>59 </th>
      <td>                       Laos</td>
      <td>  60</td>
      <td> 137</td>
      <td> -77</td>
      <td>  77</td>
    </tr>
    <tr>
      <th>60 </th>
      <td>                   Thailand</td>
      <td>  61</td>
      <td>  11</td>
      <td>  50</td>
      <td>  50</td>
    </tr>
    <tr>
      <th>61 </th>
      <td>                    Estonia</td>
      <td>  62</td>
      <td>  94</td>
      <td> -32</td>
      <td>  32</td>
    </tr>
    <tr>
      <th>62 </th>
      <td>                   Zimbabwe</td>
      <td>  63</td>
      <td>  35</td>
      <td>  28</td>
      <td>  28</td>
    </tr>
    <tr>
      <th>63 </th>
      <td>                     Guyana</td>
      <td>  64</td>
      <td> 134</td>
      <td> -70</td>
      <td>  70</td>
    </tr>
    <tr>
      <th>64 </th>
      <td>                     Israel</td>
      <td>  65</td>
      <td>  34</td>
      <td>  31</td>
      <td>  31</td>
    </tr>
    <tr>
      <th>65 </th>
      <td>                      Chile</td>
      <td>  66</td>
      <td>  47</td>
      <td>  19</td>
      <td>  19</td>
    </tr>
    <tr>
      <th>66 </th>
      <td>                 Kyrgyzstan</td>
      <td>  67</td>
      <td>  51</td>
      <td>  16</td>
      <td>  16</td>
    </tr>
    <tr>
      <th>67 </th>
      <td>                 Bangladesh</td>
      <td>  68</td>
      <td>  64</td>
      <td>   4</td>
      <td>   4</td>
    </tr>
    <tr>
      <th>68 </th>
      <td>                      Italy</td>
      <td>  69</td>
      <td> 105</td>
      <td> -36</td>
      <td>  36</td>
    </tr>
    <tr>
      <th>69 </th>
      <td>      Republic of Macedonia</td>
      <td>  70</td>
      <td> 125</td>
      <td> -55</td>
      <td>  55</td>
    </tr>
    <tr>
      <th>70 </th>
      <td>                     Brazil</td>
      <td>  71</td>
      <td>  57</td>
      <td>  14</td>
      <td>  14</td>
    </tr>
    <tr>
      <th>71 </th>
      <td>                    Romania</td>
      <td>  72</td>
      <td>  54</td>
      <td>  18</td>
      <td>  18</td>
    </tr>
    <tr>
      <th>72 </th>
      <td>                   Honduras</td>
      <td>  73</td>
      <td> 131</td>
      <td> -58</td>
      <td>  58</td>
    </tr>
    <tr>
      <th>73 </th>
      <td>                 Montenegro</td>
      <td>  74</td>
      <td>  81</td>
      <td>  -7</td>
      <td>   7</td>
    </tr>
    <tr>
      <th>74 </th>
      <td>                     Russia</td>
      <td>  75</td>
      <td>  52</td>
      <td>  23</td>
      <td>  23</td>
    </tr>
    <tr>
      <th>75 </th>
      <td>                    Vietnam</td>
      <td>  76</td>
      <td>  29</td>
      <td>  47</td>
      <td>  47</td>
    </tr>
    <tr>
      <th>76 </th>
      <td>                    Senegal</td>
      <td>  77</td>
      <td> 119</td>
      <td> -42</td>
      <td>  42</td>
    </tr>
    <tr>
      <th>77 </th>
      <td>         Dominican Republic</td>
      <td>  78</td>
      <td>  22</td>
      <td>  56</td>
      <td>  56</td>
    </tr>
    <tr>
      <th>78 </th>
      <td>                  Sri Lanka</td>
      <td>  79</td>
      <td>  80</td>
      <td>  -1</td>
      <td>   1</td>
    </tr>
    <tr>
      <th>79 </th>
      <td>                     Mexico</td>
      <td>  80</td>
      <td>  45</td>
      <td>  35</td>
      <td>  35</td>
    </tr>
    <tr>
      <th>80 </th>
      <td>                   Paraguay</td>
      <td>  81</td>
      <td> 132</td>
      <td> -51</td>
      <td>  51</td>
    </tr>
    <tr>
      <th>81 </th>
      <td>                    Uruguay</td>
      <td>  82</td>
      <td> 135</td>
      <td> -53</td>
      <td>  53</td>
    </tr>
    <tr>
      <th>82 </th>
      <td>                    Albania</td>
      <td>  83</td>
      <td> 115</td>
      <td> -32</td>
      <td>  32</td>
    </tr>
    <tr>
      <th>83 </th>
      <td>                El Salvador</td>
      <td>  84</td>
      <td> 113</td>
      <td> -29</td>
      <td>  29</td>
    </tr>
    <tr>
      <th>84 </th>
      <td>                    Georgia</td>
      <td>  85</td>
      <td>  91</td>
      <td>  -6</td>
      <td>   6</td>
    </tr>
    <tr>
      <th>85 </th>
      <td>                  Venezuela</td>
      <td>  86</td>
      <td>  12</td>
      <td>  74</td>
      <td>  74</td>
    </tr>
    <tr>
      <th>86 </th>
      <td> People's Republic of China</td>
      <td>  87</td>
      <td>  13</td>
      <td>  74</td>
      <td>  74</td>
    </tr>
    <tr>
      <th>87 </th>
      <td>                     Uganda</td>
      <td>  88</td>
      <td>  20</td>
      <td>  68</td>
      <td>  68</td>
    </tr>
    <tr>
      <th>88 </th>
      <td>                  Guatemala</td>
      <td>  89</td>
      <td> 120</td>
      <td> -31</td>
      <td>  31</td>
    </tr>
    <tr>
      <th>89 </th>
      <td>                   Slovakia</td>
      <td>  90</td>
      <td>  56</td>
      <td>  34</td>
      <td>  34</td>
    </tr>
    <tr>
      <th>90 </th>
      <td>                     Greece</td>
      <td>  91</td>
      <td>  83</td>
      <td>   8</td>
      <td>   8</td>
    </tr>
    <tr>
      <th>91 </th>
      <td>                  Swaziland</td>
      <td>  92</td>
      <td>  14</td>
      <td>  78</td>
      <td>  78</td>
    </tr>
    <tr>
      <th>92 </th>
      <td>                    Hungary</td>
      <td>  93</td>
      <td>  66</td>
      <td>  27</td>
      <td>  27</td>
    </tr>
    <tr>
      <th>93 </th>
      <td>                 Azerbaijan</td>
      <td>  94</td>
      <td> 106</td>
      <td> -12</td>
      <td>  12</td>
    </tr>
    <tr>
      <th>94 </th>
      <td>                     Cyprus</td>
      <td>  95</td>
      <td> 111</td>
      <td> -16</td>
      <td>  16</td>
    </tr>
    <tr>
      <th>95 </th>
      <td>             Czech Republic</td>
      <td>  96</td>
      <td>  87</td>
      <td>   9</td>
      <td>   9</td>
    </tr>
    <tr>
      <th>96 </th>
      <td>                  Indonesia</td>
      <td>  97</td>
      <td>  17</td>
      <td>  80</td>
      <td>  80</td>
    </tr>
    <tr>
      <th>97 </th>
      <td>                     Brunei</td>
      <td>  98</td>
      <td>  14</td>
      <td>  84</td>
      <td>  84</td>
    </tr>
    <tr>
      <th>98 </th>
      <td>                      Malta</td>
      <td>  99</td>
      <td> 122</td>
      <td> -23</td>
      <td>  23</td>
    </tr>
    <tr>
      <th>99 </th>
      <td>                     Belize</td>
      <td> 100</td>
      <td>  40</td>
      <td>  60</td>
      <td>  60</td>
    </tr>
    <tr>
      <th>100</th>
      <td>                      Ghana</td>
      <td> 101</td>
      <td> 133</td>
      <td> -32</td>
      <td>  32</td>
    </tr>
    <tr>
      <th>101</th>
      <td>                 Tajikistan</td>
      <td> 102</td>
      <td>  85</td>
      <td>  17</td>
      <td>  17</td>
    </tr>
    <tr>
      <th>102</th>
      <td>                    Armenia</td>
      <td> 103</td>
      <td> 126</td>
      <td> -23</td>
      <td>  23</td>
    </tr>
    <tr>
      <th>103</th>
      <td>                      Japan</td>
      <td> 104</td>
      <td>   4</td>
      <td> 100</td>
      <td> 100</td>
    </tr>
    <tr>
      <th>104</th>
      <td>                   Maldives</td>
      <td> 105</td>
      <td> 101</td>
      <td>   4</td>
      <td>   4</td>
    </tr>
    <tr>
      <th>105</th>
      <td>                  Mauritius</td>
      <td> 106</td>
      <td>   6</td>
      <td> 100</td>
      <td> 100</td>
    </tr>
    <tr>
      <th>106</th>
      <td>                   Malaysia</td>
      <td> 107</td>
      <td>   5</td>
      <td> 102</td>
      <td> 102</td>
    </tr>
    <tr>
      <th>107</th>
      <td>                   Cambodia</td>
      <td> 108</td>
      <td>  30</td>
      <td>  78</td>
      <td>  78</td>
    </tr>
    <tr>
      <th>108</th>
      <td>                   Suriname</td>
      <td> 109</td>
      <td>  74</td>
      <td>  35</td>
      <td>  35</td>
    </tr>
    <tr>
      <th>109</th>
      <td>               Burkina Faso</td>
      <td> 110</td>
      <td> 138</td>
      <td> -28</td>
      <td>  28</td>
    </tr>
    <tr>
      <th>110</th>
      <td>                    Liberia</td>
      <td> 111</td>
      <td>  50</td>
      <td>  61</td>
      <td>  61</td>
    </tr>
    <tr>
      <th>111</th>
      <td>                      Nepal</td>
      <td> 112</td>
      <td>   3</td>
      <td> 109</td>
      <td> 109</td>
    </tr>
    <tr>
      <th>112</th>
      <td>                     Kuwait</td>
      <td> 113</td>
      <td> 114</td>
      <td>  -1</td>
      <td>   1</td>
    </tr>
    <tr>
      <th>113</th>
      <td>                      India</td>
      <td> 114</td>
      <td>  19</td>
      <td>  95</td>
      <td>  95</td>
    </tr>
    <tr>
      <th>114</th>
      <td>       United Arab Emirates</td>
      <td> 115</td>
      <td> 118</td>
      <td>  -3</td>
      <td>   3</td>
    </tr>
    <tr>
      <th>115</th>
      <td>                      Qatar</td>
      <td> 116</td>
      <td> 139</td>
      <td> -23</td>
      <td>  23</td>
    </tr>
    <tr>
      <th>116</th>
      <td>                South Korea</td>
      <td> 117</td>
      <td>   1</td>
      <td> 116</td>
      <td> 116</td>
    </tr>
    <tr>
      <th>117</th>
      <td>                    Nigeria</td>
      <td> 118</td>
      <td>  92</td>
      <td>  26</td>
      <td>  26</td>
    </tr>
    <tr>
      <th>118</th>
      <td>                     Zambia</td>
      <td> 119</td>
      <td> 127</td>
      <td>  -8</td>
      <td>   8</td>
    </tr>
    <tr>
      <th>119</th>
      <td>                     Bhutan</td>
      <td> 120</td>
      <td>  33</td>
      <td>  87</td>
      <td>  87</td>
    </tr>
    <tr>
      <th>120</th>
      <td>                     Angola</td>
      <td> 121</td>
      <td>  90</td>
      <td>  31</td>
      <td>  31</td>
    </tr>
    <tr>
      <th>121</th>
      <td>                       Fiji</td>
      <td> 122</td>
      <td> 104</td>
      <td>  18</td>
      <td>  18</td>
    </tr>
    <tr>
      <th>122</th>
      <td>                    Tunisia</td>
      <td> 123</td>
      <td> 110</td>
      <td>  13</td>
      <td>  13</td>
    </tr>
    <tr>
      <th>123</th>
      <td>                    Bahrain</td>
      <td> 124</td>
      <td>   6</td>
      <td> 118</td>
      <td> 118</td>
    </tr>
    <tr>
      <th>124</th>
      <td>                     Turkey</td>
      <td> 125</td>
      <td>  41</td>
      <td>  84</td>
      <td>  84</td>
    </tr>
    <tr>
      <th>125</th>
      <td>                    Algeria</td>
      <td> 126</td>
      <td>  75</td>
      <td>  51</td>
      <td>  51</td>
    </tr>
    <tr>
      <th>126</th>
      <td>                   Ethiopia</td>
      <td> 127</td>
      <td>  10</td>
      <td> 117</td>
      <td> 117</td>
    </tr>
    <tr>
      <th>127</th>
      <td>                       Oman</td>
      <td> 128</td>
      <td>  28</td>
      <td> 100</td>
      <td> 100</td>
    </tr>
    <tr>
      <th>128</th>
      <td>                      Egypt</td>
      <td> 129</td>
      <td>  48</td>
      <td>  81</td>
      <td>  81</td>
    </tr>
    <tr>
      <th>129</th>
      <td>               Saudi Arabia</td>
      <td> 130</td>
      <td> 123</td>
      <td>   7</td>
      <td>   7</td>
    </tr>
    <tr>
      <th>130</th>
      <td>                 Mauritania</td>
      <td> 131</td>
      <td> 142</td>
      <td> -11</td>
      <td>  11</td>
    </tr>
    <tr>
      <th>131</th>
      <td>                     Guinea</td>
      <td> 132</td>
      <td> 140</td>
      <td>  -8</td>
      <td>   8</td>
    </tr>
    <tr>
      <th>132</th>
      <td>                    Morocco</td>
      <td> 133</td>
      <td> 100</td>
      <td>  33</td>
      <td>  33</td>
    </tr>
    <tr>
      <th>133</th>
      <td>                     Jordan</td>
      <td> 134</td>
      <td> 108</td>
      <td>  26</td>
      <td>  26</td>
    </tr>
    <tr>
      <th>134</th>
      <td>                    Lebanon</td>
      <td> 135</td>
      <td>  76</td>
      <td>  59</td>
      <td>  59</td>
    </tr>
    <tr>
      <th>135</th>
      <td>              Côte d'Ivoire</td>
      <td> 136</td>
      <td> 130</td>
      <td>   6</td>
      <td>   6</td>
    </tr>
    <tr>
      <th>136</th>
      <td>                       Iran</td>
      <td> 137</td>
      <td>  69</td>
      <td>  68</td>
      <td>  68</td>
    </tr>
    <tr>
      <th>137</th>
      <td>                       Mali</td>
      <td> 138</td>
      <td> 124</td>
      <td>  14</td>
      <td>  14</td>
    </tr>
    <tr>
      <th>138</th>
      <td>                      Syria</td>
      <td> 139</td>
      <td> 121</td>
      <td>  18</td>
      <td>  18</td>
    </tr>
    <tr>
      <th>139</th>
      <td>                       Chad</td>
      <td> 140</td>
      <td>  37</td>
      <td> 103</td>
      <td> 103</td>
    </tr>
    <tr>
      <th>140</th>
      <td>                   Pakistan</td>
      <td> 141</td>
      <td>  27</td>
      <td> 114</td>
      <td> 114</td>
    </tr>
    <tr>
      <th>141</th>
      <td>                      Yemen</td>
      <td> 142</td>
      <td>  88</td>
      <td>  54</td>
      <td>  54</td>
    </tr>
  </tbody>
</table>

Quite uncorrellated. That means that the data is not good, or that the world economic forum methods have little to do with the percentage of women born in those countries recorded semantically on a historic level. And /rho is high


In [6]:
country_map = pd.DataFrame.from_csv('helpers/aggregation_maps/country_maps.csv')

In [7]:
def map_culture(qid):
    if not type(qid) is str:
        return None
    else:
        country_list = pobs_map[qid]
        if len(country_list) == 0:
            return None
        else:
            country = country_list[0] #assumption
            culture = country_map.ix[country]['culture_name']
            return culture

In [15]:
allrecs['culture'] = allrecs['place_of_birth'].apply(map_culture)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-ce3e2a640676> in <module>()
----> 1 allrecs['culture'] = allrecs['place_of_birth'].apply(map_culture)

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds)
   2056             values = lib.map_infer(values, lib.Timestamp)
   2057 
-> 2058         mapped = lib.map_infer(values, f, convert=convert_dtype)
   2059         if len(mapped) and isinstance(mapped[0], Series):
   2060             from pandas.core.frame import DataFrame

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:57158)()

<ipython-input-14-dd8662bb0567> in map_culture(qid)
      3         return None
      4     else:
----> 5         country_list = pobs_map[qid]
      6         if len(country_list) == 0:
      7             return None

NameError: global name 'pobs_map' is not defined

In [32]:
import math
import pywikibot
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    if qid:
        if type(qid) is float:
            if math.isnan(qid):
                return None
        #first see if we've done it
        try:
            return retrieved[qid]
        except KeyError:
            try:
                page = pywikibot.ItemPage(wikidata, qid)
                data = page.get()
                lab = data['labels']['en']
                retrieved[qid] = lab
                return lab
            except KeyError:
                retrieved[qid] = qid
                return qid
    else:
        return None

In [33]:
english_label('Q6581097')


Out[33]:
u'male'

In [34]:
allrecs['gender_name'] = allrecs['gender'].apply(english_label)


VERBOSE:pywiki:Found 1 commons:commons processes running, including this one.

In [ ]:
outdf = allrecs[['gender_name','culture']]

In [ ]:
outdf.to_csv('helpers/Chi_Squared_Test_Data.csv')

how many records have gender, pob and dob

Making the 3bin genpobdob


In [9]:
has = defaultdict(dict)
for col in allrecs.columns:
    def test(x):
        if isinstance(x, float):
            return not math.isnan(x)
        else:
            return x is not None
    nonempty = len(allrecs[allrecs[col].apply(test)])
    nonemptyper = nonempty / float(len(allrecs))
    
    has[col]['Items with property'] = nonempty
    has[col]['% of total'] = nonemptyper

hasdf = pd.DataFrame.from_dict(has, orient='index')

In [10]:
print hasdf.sort('% of total').to_html(justify='right', formatters={'% of total':lambda x: '%.2f' % (x*100), 
                                                   'Items with property':lambda x: '{0:,}'.format(x)})


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>% of total</th>
      <th>Items with property</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>ethnic_group</th>
      <td>  0.30</td>
      <td>    7,772</td>
    </tr>
    <tr>
      <th>country</th>
      <td> 23.47</td>
      <td>  601,361</td>
    </tr>
    <tr>
      <th>place_of_birth</th>
      <td> 23.93</td>
      <td>  613,092</td>
    </tr>
    <tr>
      <th>dod</th>
      <td> 28.79</td>
      <td>  737,522</td>
    </tr>
    <tr>
      <th>citizenship</th>
      <td> 41.44</td>
      <td>1,061,634</td>
    </tr>
    <tr>
      <th>culture</th>
      <td> 45.20</td>
      <td>1,158,086</td>
    </tr>
    <tr>
      <th>dob</th>
      <td> 57.92</td>
      <td>1,484,003</td>
    </tr>
    <tr>
      <th>gender</th>
      <td> 89.40</td>
      <td>2,290,433</td>
    </tr>
    <tr>
      <th>site_links</th>
      <td> 99.05</td>
      <td>2,537,545</td>
    </tr>
    <tr>
      <th>qid</th>
      <td>100.00</td>
      <td>2,561,999</td>
    </tr>
  </tbody>
</table>

In [11]:
hasdob = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
len(hasdob)


Out[11]:
1484003

In [12]:
hasgender = hasdob[hasdob['gender'].apply(lambda x: not math.isnan(x) if type(x) is float else True)]
len(hasgender)


Out[12]:
1484003

In [13]:
hascult = hasgender[hasgender['culture'].apply(lambda x: x is not None)]
len(hascult)


Out[13]:
915101

In [14]:
hascult.head()


Out[14]:
citizenship country culture dob dod ethnic_group gender place_of_birth qid site_links
0 Q30 Q30 english-speaking 1732 1799 None Q6581097 Q494413 Q23 zhwiki|kywiki|euwiki|plwiki|bswiki|angwiki|uzw...
1 Q145 Q145 english-speaking 1952 2001 None Q6581097 Q350 Q42 zhwiki|jvwiki|euwiki|plwiki|bswiki|eswiki|tawi...
10 Q30 Q30 english-speaking 1973 NaN None Q6581072 Q1020700 Q555 zhwiki|eowiki|plwiki|kowiki|ruwiki|frwiki|eswi...
100 Q36 None catholic european 1989 NaN None Q6581097 None Q2327 dewiki|plwiki|ruwiki|enwiki|ocwiki|svwiki|dawiki|
1000 Q21 None english-speaking 1944 1994 None Q6581097 None Q28348 ptwiki|plwiki|kowiki|hewiki|frwiki|ruwiki|eswi...

In [15]:
culture_groups = hascult.groupby('culture')

In [16]:
def make_perc_series(df):
    years_per = dict()
    dobs = df.groupby('dob')
    #hate to use a for loop, fixlater
    for year, group in dobs:
        nmcount = group[group['gender'] != 'Q6581097']['gender'].count()
        totalcount = group['gender'].count()
        nmper = nmcount / float(totalcount)
        years_per[year] = nmper
        perc_series = pd.TimeSeries(data=years_per)
    
    return perc_series
    
perc_dict = dict()
for name, group in culture_groups:
    perc_series = make_perc_series(group)
    perc_dict[name] = perc_series

In [34]:
perc_df.tail(10)


Out[34]:
africa catholic european confucian english-speaking islamic latin america orthodox protestant european south asia euro
2007 0 1.000000 0 0.6 1 NaN NaN 0.750000 0 1.750000
2008 NaN 1.000000 NaN 1.0 NaN NaN NaN NaN NaN NaN
2009 NaN 0.000000 NaN 0.0 NaN NaN NaN 0.000000 0 0.000000
2010 0 NaN NaN 0.4 NaN NaN 0 0.000000 0 NaN
2011 NaN 0.333333 NaN 0.0 NaN 1 0 0.250000 1 0.583333
2012 NaN NaN NaN 0.0 0 NaN 0 0.666667 0 NaN
2013 NaN 0.000000 NaN 0.0 NaN 0 0 NaN 0 NaN
2014 NaN NaN NaN 0.5 NaN NaN NaN NaN NaN NaN
2411 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2426 NaN NaN NaN NaN 0 NaN 0 NaN NaN NaN

In [35]:
perc_df = pd.DataFrame.from_dict(perc_dict)
years = range(1800,2000,int(200/6.0))
subbd_df = perc_df.ix[years]
infogram = subbd_df
infogram.to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',index=True, encoding='utf-8')

In [37]:
fig, (full, modern) = plt.subplots(1,2, figsize=(20,6))

end_year = 2000

for start_year, ra_len, ax in zip((-1000,1800), (100,10), (full, modern)):
    ra_dict = dict()
    for name, series in perc_dict.iteritems():
        ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
    cult_dob_per  = pd.DataFrame(ra_dict)
    
    if start_year == 1800:
        year_list = range(1900,end_year,10)
        cult_dob_per.ix[years].to_csv('Magnus Gender analysis/infogram_pob_dob_cult.csv',index=True, encoding='utf-8')
    

    cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False,zorder=-ra_len)
    ax.set_xlim((start_year, end_year))
    ax.set_xticks(range(start_year, end_year,(end_year-start_year) / 16))
    ax.set_ylim((0,0.6))
    ax.set_title(u'{}{}, with {} year Rolling Average'.format(start_year, end_year,ra_len))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))

full.legend = legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
full.set_xticks(range(-1000, end_year,(end_year+1000) / 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)



In [27]:
fig, (full, modern) = plt.subplots(2,1, figsize=(12,8), sharex=False)

end_year = 2000

for start_year, ra_len, ax in zip((-1000,1800), (100,10), (full, modern)):
    ra_dict = dict()
    for name, series in perc_dict.iteritems():
        ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
    cult_dob_per  = pd.DataFrame(ra_dict)
    cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False,zorder=-ra_len)
    ax.set_xlim((start_year, end_year))
    ax.set_xticks(range(start_year, end_year,(end_year-start_year) / 16))
    ax.set_ylim((0,0.6))
    ax.set_title(u'{}{}, with {} year Rolling Average'.format(start_year, end_year,ra_len))
    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))

full.legend = legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
#full.set_xticks(range(-1000, end_year,(end_year+1000) / 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)


Make dobculture


In [184]:
dobexists = allrecs[allrecs['dob'].apply(lambda x: not math.isnan(x))]
dobcultureexists = dobexists[dobexists['culture'].apply(lambda x: x is not None)]
len(dobcultureexists)


Out[184]:
915101

In [185]:
culture_groups = dobcultureexists[['dob','culture']].groupby(by='culture')

In [186]:
def make_tot_series(df):
    years_tot = dict()
    dobs = df.groupby('dob')
    #hate to use a for loop, fixlater
    for year, group in dobs:
        totalcount = group['culture'].count()
        years_tot[year] = totalcount
        tot_series = pd.TimeSeries(data=years_tot)
    
    return tot_series
    
tot_dict = dict()
for name, group in culture_groups:
    tot_dict[name] = make_tot_series(group)

In [189]:
end_year = 2014
for start_year in [1500, 1800]:
    for ra_len in [2, 5, 10]:
        ra_dict = dict()
        for name, series in tot_dict.iteritems():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)

        cult_dob = pd.DataFrame(ra_dict)
        plt = cult_dob.plot(figsize=(20,6),  cmap='Set2', linewidth=1.5)
        plt.set_xlim((start_year,end_year))
        plt.set_xticks(range(start_year,end_year,(end_year-start_year) / 15))
        plt.set_title('Total Biographies by Date of Birth |  %s Year Rolling Average' % str(ra_len))
        plt.legend(loc=2)



In [188]:
for start_year, end_year in zip([-2000, -1000], [1000,1500]):
    for ra_len in [1,2,10]:
        ra_dict = dict()
        for name, series in tot_dict.iteritems():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        plt = cult_dob.plot(figsize=(20,6),  cmap='Set2', linewidth=1.5)
        plt.set_ylim((0,50))
        plt.set_yscale('log')
        plt.set_xlim((start_year,end_year))
        plt.set_xticks(range(start_year,end_year,(end_year-start_year) / 15))
        plt.set_title('Total Biographies by Date of Birth |  %s Year Rolling Average' % str(ra_len))
        plt.legend(loc=2)


/usr/local/lib/python2.7/dist-packages/numpy/ma/core.py:3895: UserWarning: Warning: converting a masked element to nan.
  warnings.warn("Warning: converting a masked element to nan.")

In [ ]: